In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
In [2]:
import plotly.express as px
In [3]:
import scipy.stats as stats
In [4]:
def read_data(addr_string,index):
    """Load one worksheet of an Excel workbook into a DataFrame.

    Parameters
    ----------
    addr_string
        Location of the workbook, handed unchanged to ``pd.read_excel``.
    index
        Sheet selector, handed to ``pd.read_excel`` as ``sheet_name``.

    Returns
    -------
    Whatever ``pd.read_excel`` returns for the selected sheet.
    """
    sheet = pd.read_excel(addr_string, sheet_name=index)
    return sheet
In [5]:
# Absolute, machine-specific location of the workbook; adjust it when running elsewhere.
path = r'C:\Users\rishi\Downloads\Projects\Bank Loans\Bank_Personal_Loan_Modelling.xlsx'
# The second argument is forwarded by read_data to pd.read_excel as its sheet selector.
df = read_data(path,1)
In [6]:
df.shape
Out[6]:
(5000, 14)
In [7]:
df.isnull().sum()
Out[7]:
ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64
In [8]:
# Drop the two columns that are not used in the analysis below.
# Reassigning (instead of inplace=True) and ignoring already-missing labels
# keeps this cell safe to re-run without raising KeyError.
df = df.drop(columns = ['ID', 'ZIP Code'], errors = 'ignore')
In [9]:
df.head()
Out[9]:
Age Experience Income Family CCAvg Education Mortgage Personal Loan Securities Account CD Account Online CreditCard
0 25 1 49 4 1.6 1 0 0 1 0 0 0
1 45 19 34 3 1.5 1 0 0 1 0 0 0
2 39 15 11 1 1.0 1 0 0 0 0 0 0
3 35 9 100 1 2.7 2 0 0 0 0 0 0
4 35 8 45 4 1.0 2 0 0 0 0 0 1
In [10]:
df.columns
Out[10]:
Index(['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')
In [11]:
px.box(df , y = ['Age', 'Experience', 'Income', 'Family', 'Education'] , title = '5 point summary of the basic data')
In [12]:
df.skew()
Out[12]:
Age                  -0.029341
Experience           -0.026325
Income                0.841339
Family                0.155221
CCAvg                 1.598457
Education             0.227093
Mortgage              2.104002
Personal Loan         2.743607
Securities Account    2.588268
CD Account            3.691714
Online               -0.394785
CreditCard            0.904589
dtype: float64
In [13]:
df.hist(figsize=(20,20) )
Out[13]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED529550>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED031A00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED061E50>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED09A310>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED0C6760>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED0F0AF0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED0F0BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED12B0D0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED1838E0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED1B0D30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED1E81F0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001FCED213640>]],
      dtype=object)
In [14]:
sns.distplot(df['Experience'])
Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fced615a00>
In [15]:
df['Experience'].mean()
Out[15]:
20.1046
In [16]:
Negative_exp = df[df['Experience']<0]
In [17]:
sns.distplot(Negative_exp['Age'])
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fced695f10>
In [18]:
Negative_exp['Experience'].mean()
Out[18]:
-1.4423076923076923
In [19]:
# DataFrame.size is the total number of cells (rows x columns), not the row count.
Negative_exp.size
Out[19]:
624
In [20]:
data = df.copy()
In [21]:
# Negative experience values are invalid. Replace them with the mean of the
# non-negative entries only, so the invalid values being replaced do not
# pull the fill value down.
data['Experience'] = np.where(
    data['Experience'] < 0,
    data.loc[data['Experience'] >= 0, 'Experience'].mean(),
    data['Experience'],
)
In [22]:
data[data['Experience'] < 0]
Out[22]:
Age Experience Income Family CCAvg Education Mortgage Personal Loan Securities Account CD Account Online CreditCard
In [23]:
# Annotated heatmap of the pairwise correlations between the columns of df.
# NOTE(review): this uses df, not the cleaned copy `data` -- confirm that is intended.
plt.figure(figsize = (10, 6))
correlations = df.corr()
heatmap_ax = sns.heatmap(correlations, annot = True)
heatmap_ax.set_title('Relationship of Variables')
Out[23]:
Text(0.5, 1.0, 'Relationship of Variables')
In [24]:
data = data.drop(['Experience'] , axis = 1)
In [25]:
data['Education'].unique()
Out[25]:
array([1, 2, 3], dtype=int64)
In [26]:
def mark(x):
    """Translate an education code into its label.

    1 maps to 'Undergrad', 2 maps to 'Graduate', and every other value
    maps to 'Professional'.
    """
    return 'Undergrad' if x == 1 else ('Graduate' if x == 2 else 'Professional')
In [27]:
data['Edu_Mark'] = data['Education'].apply(mark)
In [28]:
edu_dis = data.groupby('Edu_Mark')['Age'].count()
In [29]:
px.pie(data , values = edu_dis , names = edu_dis.index , title = 'Customer Educational Experience') 
In [30]:
def Security_CD(row):
    """Label a row by its 'Securities Account' / 'CD Account' flags.

    Both flags are compared against 0 and 1. A row whose pair of flags
    matches none of the four 0/1 combinations yields None.
    """
    securities_flag = row['Securities Account']
    deposit_flag = row['CD Account']

    # (securities, deposit) -> label, checked in the same order as before.
    categories = (
        (1, 1, 'Holds Securities and Deposits'),
        (0, 0, "Doesn't hold Securities and Deposits"),
        (1, 0, 'Holds Securities'),
        (0, 1, 'Holds Deposits'),
    )
    for wanted_securities, wanted_deposit, label in categories:
        if securities_flag == wanted_securities and deposit_flag == wanted_deposit:
            return label
    return None
In [31]:
data['Account_holder_category'] = data.apply(Security_CD , axis = 1)
In [32]:
data.head()
Out[32]:
Age Income Family CCAvg Education Mortgage Personal Loan Securities Account CD Account Online CreditCard Edu_Mark Account_holder_category
0 25 49 4 1.6 1 0 0 1 0 0 0 Undergrad Holds Securities
1 45 34 3 1.5 1 0 0 1 0 0 0 Undergrad Holds Securities
2 39 11 1 1.0 1 0 0 0 0 0 0 Undergrad Doesn't hold Securities and Deposits
3 35 100 1 2.7 2 0 0 0 0 0 0 Graduate Doesn't hold Securities and Deposits
4 35 45 4 1.0 2 0 0 0 0 0 1 Graduate Doesn't hold Securities and Deposits
In [33]:
values = data['Account_holder_category'].value_counts()
In [34]:
fig = px.pie(data , values = values , names = values.index , title = 'Customers with securities and deposit' , hole = 0.5) 
fig.show()
In [35]:
px.box( data , x = 'Education' , y = 'Income' , facet_col= 'Personal Loan',
       title = 'Customer on the basis of Education status and Income for Personal Loan')
In [ ]:
 
In [36]:
def plot(col1,col2 , label1, label2 , title):
    """Overlay the density curves of ``col1`` for the two classes of ``col2``.

    Reads the notebook-level ``data`` frame and draws one kernel density
    curve for the rows where ``col2 == 0`` and one for the rows where
    ``col2 == 1`` on a single new figure.

    Parameters
    ----------
    col1 : str
        Column whose distribution is drawn.
    col2 : str
        Column holding the 0/1 flag used to split the rows.
    label1 : str
        Legend entry for the ``col2 == 0`` curve.
    label2 : str
        Legend entry for the ``col2 == 1`` curve.
    title : str
        Figure title.
    """
    plt.figure(figsize=(12,8))
    # sns.kdeplot draws the density-only curve that distplot(hist=False) used
    # to draw, without depending on the deprecated distplot API.
    for flag, label in ((0, label1), (1, label2)):
        sns.kdeplot(data.loc[data[col2] == flag, col1], label = label)
    # Set the x label explicitly so it does not depend on the seaborn version.
    plt.xlabel(col1)
    plt.legend()
    plt.title (title)
In [37]:
plot ('Income' , 'Personal Loan' , 'Customers Income not availing Personal Loan','Customers Income availing Personal Loan' ,'Income Distribution')
In [38]:
plot ('CCAvg' , 'Personal Loan' , 'Credit Card Avg with availing no Personal Loan',
      'Credit Card Avg with availing Personal Loan' ,'Credit Card Avg Distribution')
In [39]:
plot ('Mortgage' , 'Personal Loan' , 'Mortgage with no Personal Loan',
      'Mortgage with Personal Loan' ,'Mortgage Distribution')
In [40]:
col_names = ['Securities Account' , 'Account_holder_category' , 'CreditCard' , 'Online']
In [41]:
for i in col_names:
    plt.figure(figsize=(10,5))
    sns.countplot( x = i , hue = 'Personal Loan' , data = data)
    plt.title ('categories of customer on the basis of {}'. format(i))
In [42]:
# Pass the vectors by keyword: newer seaborn releases no longer accept
# positional x/y data vectors.
sns.scatterplot(x = data['Age'], y = data['Personal Loan'], hue = data['Family'])
plt.title ("Age and Personal Loan Relationship")
Out[42]:
Text(0.5, 1.0, 'Age and Personal Loan Relationship')
In [43]:
Ho = "Age Doesn't have impact on personal loan"
Ha = "Age Does have impact on personal loan"
In [44]:
def hypo(col1,col2,H0,HA):
    """Compare ``col2`` between the two classes of ``col1`` with a t-test.

    Reads the notebook-level ``data`` frame, splits it into the rows where
    ``col1 == 0`` and the rows where ``col1 == 1``, and passes the ``col2``
    values of both groups to ``stats.ttest_ind`` (called with its default
    settings). It then prints one of the two supplied statements together
    with the p-value and draws a scatter plot of ``col1`` against ``col2``.

    Parameters
    ----------
    col1 : str
        Column holding the 0/1 flag that defines the two groups.
    col2 : str
        Column whose values are compared between the groups.
    H0 : str
        Statement printed when the p-value is not below 0.05.
    HA : str
        Statement printed when the p-value is below 0.05.
    """
    significance_level = 0.05

    arr1 = np.array(data[data[col1] == 0][col2])
    arr2 = np.array(data[data[col1] == 1][col2])
    t,p_value = stats.ttest_ind(arr1 , arr2 , axis = 0)
    if p_value < significance_level:
        print (HA , ' as the p value is less then 0.05 with a value of {}' .format(p_value))
    else:
        print (H0 , ' as the p value is more then 0.05 with a value of {}' .format(p_value))
    # Keyword x/y: newer seaborn releases no longer accept positional data vectors.
    sns.scatterplot(x = data[col1], y = data[col2])
    plt.title (col1 + " and " +col2+ " Relation"  )
In [45]:
hypo('Personal Loan','Age', Ho, Ha )
Age Doesn't have impact on personal loan  as the p value is more then 0.05 with a value of 0.584959263705325
In [46]:
Ho = "Income Doesn't have impact on personal loan"
Ha = "Income Does have impact on personal loan"
hypo('Personal Loan','Income', Ho, Ha)
Income Does have impact on personal loan  as the p value is less then 0.05 with a value of 0.0
In [47]:
Ho = "Family Doesn't have impact on personal loan"
Ha = "Family Does have impact on personal loan"
hypo('Personal Loan','Family', Ho, Ha)
Family Does have impact on personal loan  as the p value is less then 0.05 with a value of 1.4099040685673807e-05